import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize']=12,8
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler,StandardScaler,MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import FunctionTransformer
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import itertools
# Load the raw job-postings workbook and build a de-duplicated working frame.
data = pd.read_excel('JOB WISE .xlsx')
data = data.dropna()
data
# Keep only the columns needed for the demand analysis.  .copy() makes df1 an
# independent frame, preventing SettingWithCopyWarning (and silently lost
# edits) when the 'Sector' labels are normalized in place later.
df1 = data[['Job_Title', 'Company_Name', 'Posted_Date', 'Sector', 'Job']].copy()
df1 = df1.drop_duplicates()
df1['Posted_Date'] = pd.to_datetime(df1['Posted_Date'])
# Posting counts per sector and per job title (job sorted most-frequent first).
sector = df1.groupby('Sector')['Sector'].count()
job = df1.groupby('Job')['Job'].count()
job = job.sort_values(ascending=False)
job
sector
# Normalize the many inconsistent spellings/casings of 'Sector' to one
# canonical label each.  A single dict-based replace() on the column replaces
# thirteen chained in-place replacements (which warn on sliced frames), and
# the original's no-op mapping of 'Science and Pharmaceuticals' to itself is
# dropped.
_sector_aliases = {
    'Information Technology \n': 'Information Technology',
    'Information Technonlogy': 'Information Technology',
    'Business, consulting and management': 'Business Consulting and Management',
    'business, consulting and management': 'Business Consulting and Management',
    'Business Counsulting and Management': 'Business Consulting and Management',
    'Marketing, Advertising, and PR': 'Marketing, Advertising and PR',
    'marketing, Advertising, and PR': 'Marketing, Advertising and PR',
    'Marketing Advertising and PR': 'Marketing, Advertising and PR',
    'Accountancy, banking, and finance jobs': 'Accounting, Finance and Banking',
    'Finance': 'Accounting, Finance and Banking',
    'Accounting , Finance and Banking': 'Accounting, Finance and Banking',
    'Accounting Finance and Banking': 'Accounting, Finance and Banking',
    'Public service and Administration': 'Public Service and Administration',
    'Public services and Administration': 'Public Service and Administration',
    'public services and Administration': 'Public Service and Administration',
    'Science and Pharmaceuticals \n': 'Science and Pharmaceuticals',
    'Science and Pharmaceutical': 'Science and Pharmaceuticals',
    'Leisure, Sports, and Tourism': 'Leisure, Sports and Tourism',
    'Transport and logistics': 'Transport and Logistics',
    'Trasport and Logistics': 'Transport and Logistics',
    'Transport And Logistics': 'Transport and Logistics',
    'Media and the Internet': 'Media and Internet',
    'Teacher, Training and Education': 'Teacher Training and Education',
    'Social care': 'Social Care',
    'Energy and utilities': 'Energy and Utilities',
    'Law enforcement and security': 'Law Enforcement and Security',
}
df1['Sector'] = df1['Sector'].replace(_sector_aliases)
# Re-count postings per sector now that the labels are canonical.
sector=df1.groupby('Sector')['Sector'].count()
# Notebook-style displays: sorted counts, then the number of distinct sectors.
sector.sort_values(ascending=False)
sector.count()
# Descending counts drive the per-sector job listing below.
sec=sector.sort_values(ascending=False)
# For every sector (most-posted first), print the distinct jobs it contains.
for sector_name in sec.index:
    sector_rows = df1[df1['Sector'] == sector_name]
    jobs_in_sector = sector_rows.groupby('Job')['Job'].count()
    print(sector_name.upper() + '\n')
    for job_title in jobs_in_sector.index:
        print(job_title.capitalize())
    print('\n__________________________\n')
# For each job title, count how many distinct posting dates (daily samples)
# it has.  DataFrame.append was removed in pandas 2.0, so rows are collected
# in a plain list and materialized once with the DataFrame constructor.
_rows = []
for job_name in job.index:
    postings = df1[df1['Job'] == job_name]
    per_date = postings.groupby('Posted_Date')['Posted_Date'].count()
    _rows.append({"Job": job_name, "Number of Sample": per_date.count()})
job_date = pd.DataFrame(_rows, columns=['Job', 'Number of Sample'])
job_data = job_date.sort_values(by='Number of Sample', ascending=False)
# Keep only jobs with at least 35 daily samples -- enough history to model.
job_data = job_data[job_data['Number of Sample'] >= 35]
job_pre = job_data.set_index(['Job'])
job_pre
# One time-series subplot per job for the 20 most-posted job titles.
fig, ax1 = plt.subplots(len(job.index[0:20]), figsize=(24, 50))
for axis_idx, job_name in enumerate(job.index[0:20]):
    postings = df1[df1['Job'] == job_name]
    per_date = postings.groupby('Posted_Date')['Posted_Date'].count()
    sns.set(font_scale=1)
    ax1[axis_idx].plot(per_date, label=job_name)
    ax1[axis_idx].legend()
# Horizontal bar chart of the 25 most frequent job titles.
sns.set(font_scale=3)
fig, ax = plt.subplots(figsize=(40, 30))
sns.barplot(x=job.values[0:25], y=job.index[0:25], ax=ax)
plt.ylabel('Jobs', fontweight='bold')
ax.set_title('Job Postings\n', fontweight='bold')
from statsmodels.tsa.stattools import adfuller
# Accumulator: one row per (transformation, job) augmented Dickey-Fuller test.
stat = pd.DataFrame(columns=['Transformation', 'Job', 'Test Statistic', 'p-value', 'Critical Value (1%)', 'Critical Value (5%)'])

def test_stationarity(timeseries, Transformation, job):
    """Plot rolling statistics and run the augmented Dickey-Fuller test.

    Plots the series with its 10-observation rolling mean/std, prints the
    ADF results, and appends the test statistic, p-value and 1%/5% critical
    values for this (Transformation, job) pair to the module-level `stat`
    DataFrame.
    """
    sns.set(font_scale=1)
    # Rolling statistics over a 10-observation window.
    rolmean = timeseries.rolling(window=10).mean()
    rolstd = timeseries.rolling(window=10).std()
    # Plot original series against its rolling mean and std.
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Dickey-Fuller test: the null hypothesis is that the series is
    # non-stationary (has a unit root).
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
    crit = dftest[4]
    global stat
    trans_stat = pd.DataFrame({"Transformation": [Transformation], "Job": [job],
                               "Test Statistic": [dftest[0]], "p-value": [dftest[1]],
                               "Critical Value (1%)": crit['1%'],
                               "Critical Value (5%)": crit['5%']}, index=[0])
    # DataFrame.append was removed in pandas 2.0 -- use pd.concat instead.
    stat = pd.concat([stat, trans_stat], ignore_index=True)
# ADF stationarity check of the log-transformed series for each modelled job.
for i in job_pre.index:
    it = df1[df1['Job'] == i]
    d_it = it.groupby('Posted_Date')['Posted_Date'].count()
    print(i.upper())
    demand_log = np.log(d_it)
    # BUG FIX: the original computed demand_log but passed the raw d_it while
    # labelling the result 'Log Transformation'; pass the log series instead.
    test_stationarity(demand_log, 'Log Transformation', i)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the min-max normalized series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(matrix).flatten()
    demand_nor = pd.Series(scaled, index=daily.index)
    test_stationarity(demand_nor, 'MaxMinScaler Transfromation (Normalization)', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the standardized (zero mean, unit variance)
# series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    standardized = StandardScaler().fit_transform(daily.values.reshape(-1, 1)).flatten()
    demand_std = pd.Series(standardized, index=daily.index)
    test_stationarity(demand_std, 'StandardScaler Transformation (Standardization)', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the square-root-transformed series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    test_stationarity(np.sqrt(daily), 'Square Root Transformation', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the cube-root-transformed series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    test_stationarity(np.cbrt(daily), 'Cube Root Transformation', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the first-differenced series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    differenced = daily.diff(periods=1).dropna()
    test_stationarity(differenced, 'Differencing Transformation (Shift 1)', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the lag-2 differenced series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    differenced = daily.diff(periods=2).dropna()
    test_stationarity(differenced, 'Differencing Transformation (Shift 2)', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the lag-3 differenced series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    differenced = daily.diff(periods=3).dropna()
    test_stationarity(differenced, 'Differencing Transformation (Shift 3)', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the Box-Cox power-transformed series for each
# job.  Box-Cox requires strictly positive input; daily counts are >= 1.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    transformed = PowerTransformer(method='box-cox').fit_transform(matrix).flatten()
    demand_pow_bc = pd.Series(transformed, index=daily.index)
    test_stationarity(demand_pow_bc, 'Power Transformation ( Box-Cox )', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the Yeo-Johnson power-transformed series.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    transformed = PowerTransformer(method='yeo-johnson').fit_transform(matrix).flatten()
    demand_pow_yj = pd.Series(transformed, index=daily.index)
    test_stationarity(demand_pow_yj, 'Power Transformation ( Yeo-Johnson )', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the max-abs-scaled series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    scaled = MaxAbsScaler().fit_transform(matrix).flatten()
    demand_abs = pd.Series(scaled, index=daily.index)
    test_stationarity(demand_abs, 'MaxAbsScaler Transformation', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the robust-scaled (median/IQR) series.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    scaled = RobustScaler().fit_transform(matrix).flatten()
    demand_rs = pd.Series(scaled, index=daily.index)
    test_stationarity(demand_rs, 'Robust Scaler Transformation', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the uniform quantile-transformed series.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    transformed = QuantileTransformer(output_distribution='uniform').fit_transform(matrix).flatten()
    demand_qtu = pd.Series(transformed, index=daily.index)
    test_stationarity(demand_qtu, 'Quantile Transformation ( Uniform )', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the normal quantile-transformed series.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    transformed = QuantileTransformer(output_distribution='normal').fit_transform(matrix).flatten()
    demand_qtn = pd.Series(transformed, index=daily.index)
    test_stationarity(demand_qtn, 'Quantile Transformation ( Normal )', job_name)
    print('\n------------------------------------------------------\n')
# ADF stationarity check of the log1p function-transformed series.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    transformed = FunctionTransformer(np.log1p).fit_transform(matrix).flatten()
    demand_ft = pd.Series(transformed, index=daily.index)
    test_stationarity(demand_ft, 'Function Transformation', job_name)
    print('\n------------------------------------------------------\n')
# DataFrame containing the stationarity-test results for every transformation
stat.head(25)
# For each modelled job, report up to three transformations with the smallest
# ADF p-value among those that pass stationarity (p <= 0.05 and the test
# statistic below both the 1% and 5% critical values).
k = 1
for job_name in job_pre.index:
    soj = stat[stat['Job'] == job_name]
    print(str(k) + '] ' + job_name.upper(), '\n')
    soj = soj[(soj['p-value'] <= 0.05)
              & ((soj['Critical Value (1%)'] > soj['Test Statistic'])
                 & (soj['Critical Value (5%)'] > soj['Test Statistic']))]
    soj = soj.sort_values('p-value')
    # BUG FIX: the inner loop originally reused `i`, shadowing the outer loop
    # variable; enumerate with a distinct name also replaces the manual
    # counter j.
    for rank, trans_name in enumerate(soj['Transformation'][0:3].values, start=1):
        print(str(rank) + '. ' + trans_name)
    print("___________________________________________________________________________________________\n")
    k += 1
# Candidate ARIMA orders: every (p, d, q) with each term in 0..4,
# i.e. 5**3 = 125 combinations for the grid search.
p = d = q = range(0, 5)
pdq = list(itertools.product(p, d, q))
# Accumulator for the best AIC/RMSE found per (transformation, job).
aic_rsme = pd.DataFrame(columns=['Transformation', 'Job', 'AIC', 'RSME', 'p', 'd', 'q'])
def arima_model(ts, Transformation, job):
    """Grid-search ARIMA(p, d, q) over `pdq` by AIC, evaluate on a hold-out.

    Fits every candidate order in the module-level `pdq` grid, selects the
    order with the lowest AIC, refits that order on all but the last 25
    observations, forecasts the 25-step hold-out, and appends
    AIC/RMSE/(p, d, q) for this (Transformation, job) pair to the
    module-level `aic_rsme` frame.
    """
    aic = []
    for param in pdq:
        try:
            fit = ARIMA(ts, order=param).fit()
            aic.append([param, fit.aic])
        except Exception:
            # Many (p, d, q) combinations fail to converge or are invalid;
            # skip them and keep searching.
            continue
    aic_df = pd.DataFrame(aic)
    aic_df = aic_df.set_index(aic_df[0])[1]
    aic_sort = aic_df.sort_values(ascending=True)
    best_aic = aic_sort.index[0]
    # Hold out the last 25 observations for evaluation.
    t = len(ts) - 25
    demand_train = ts[0:t]
    demand_test = ts[t:]
    # BUG FIX: the original forecast came from whichever model happened to be
    # fitted LAST in the grid search (not the best-AIC model), and that model
    # was trained on the full series including the test window (leakage).
    # Refit the best order on the training split only, then forecast.
    best_fit = ARIMA(demand_train, order=best_aic).fit()
    demand_forecast = best_fit.forecast(steps=25)[0]
    jobs_error = mean_squared_error(demand_test, demand_forecast)
    global aic_rsme
    trans_aic_rsme = pd.DataFrame({"Transformation": [Transformation], "Job": [job],
                                   "AIC": aic_sort[0], "RSME": np.sqrt(jobs_error),
                                   "p": best_aic[0], "d": best_aic[1], "q": best_aic[2]},
                                  index=[0])
    # DataFrame.append was removed in pandas 2.0 -- use pd.concat instead.
    aic_rsme = pd.concat([aic_rsme, trans_aic_rsme], ignore_index=True)
    # Per-call progress report.  The original printed the loop GLOBAL `i`;
    # use the `job` parameter, which every caller passes the same value for.
    print(job.upper())
    print("Completed")
    print("________________________________________\n")
# ARIMA grid search on the log-transformed series for each modelled job.
for i in job_pre.index:
    it = df1[df1['Job'] == i]
    d_it = it.groupby('Posted_Date')['Posted_Date'].count()
    demand_log = np.log(d_it)
    # BUG FIX: the original computed demand_log but passed the raw d_it while
    # labelling the run 'Log Transformation'; pass the log series instead.
    arima_model(demand_log, 'Log Transformation', i)
print("\n________________________________________")
print("Completed")
print("________________________________________")
# ARIMA grid search on the min-max normalized series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    matrix = daily.values.reshape(-1, 1)
    scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(matrix).flatten()
    demand_nor = pd.Series(scaled, index=daily.index)
    arima_model(demand_nor, 'MaxMinScaler Transfromation (Normalization)', job_name)
print("________________________________________")
print("Completed")
print("________________________________________")
# ARIMA grid search on the standardized series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    standardized = StandardScaler().fit_transform(daily.values.reshape(-1, 1)).flatten()
    demand_std = pd.Series(standardized, index=daily.index)
    arima_model(demand_std, 'StandardScaler Transformation (Standardization)', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the square-root-transformed series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    arima_model(np.sqrt(daily), 'Square Root Transformation', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the cube-root-transformed series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    arima_model(np.cbrt(daily), 'Cube Root Transformation', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the first-differenced series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    differenced = daily.diff(periods=1).dropna()
    arima_model(differenced, 'Differencing Transformation (Shift 1)', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the lag-2 differenced series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    differenced = daily.diff(periods=2).dropna()
    arima_model(differenced, 'Differencing Transformation (Shift 2)', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the lag-3 differenced series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    differenced = daily.diff(periods=3).dropna()
    arima_model(differenced, 'Differencing Transformation (Shift 3)', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the Box-Cox power-transformed series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    transformed = PowerTransformer(method='box-cox').fit_transform(matrix).flatten()
    demand_pow_bc = pd.Series(transformed, index=daily.index)
    arima_model(demand_pow_bc, 'Power Transformation ( Box-Cox )', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the Yeo-Johnson power-transformed series.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    transformed = PowerTransformer(method='yeo-johnson').fit_transform(matrix).flatten()
    demand_pow_yj = pd.Series(transformed, index=daily.index)
    arima_model(demand_pow_yj, 'Power Transformation ( Yeo-Johnson )', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the max-abs-scaled series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    scaled = MaxAbsScaler().fit_transform(matrix).flatten()
    demand_abs = pd.Series(scaled, index=daily.index)
    arima_model(demand_abs, 'MaxAbsScaler Transformation', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the robust-scaled (median/IQR) series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    scaled = RobustScaler().fit_transform(matrix).flatten()
    demand_rs = pd.Series(scaled, index=daily.index)
    arima_model(demand_rs, 'Robust Scaler Transformation', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the uniform quantile-transformed series.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    transformed = QuantileTransformer(output_distribution='uniform').fit_transform(matrix).flatten()
    demand_qtu = pd.Series(transformed, index=daily.index)
    arima_model(demand_qtu, 'Quantile Transformation ( Uniform )', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the normal quantile-transformed series.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    transformed = QuantileTransformer(output_distribution='normal').fit_transform(matrix).flatten()
    demand_qtn = pd.Series(transformed, index=daily.index)
    arima_model(demand_qtn, 'Quantile Transformation ( Normal )', job_name)
print('________________________________________')
print("Completed")
print("________________________________________")
# ARIMA grid search on the log1p function-transformed series for each job.
for job_name in job_pre.index:
    daily = df1[df1['Job'] == job_name].groupby('Posted_Date')['Posted_Date'].count()
    print(job_name)
    matrix = daily.values.reshape(-1, 1)
    transformed = FunctionTransformer(np.log1p).fit_transform(matrix).flatten()
    demand_ft = pd.Series(transformed, index=daily.index)
    arima_model(demand_ft, 'Function Transformation', job_name)
print('________________________________________')
# Final summary: best AIC/RMSE and (p, d, q) per (transformation, job),
# displayed and then persisted to an Excel workbook.
aic_rsme
aic_rsme.to_excel('AIC_RSME.xlsx', engine="xlsxwriter")